In this document we analyze the PASS1B proteomics animal data of MoTrPAC. The flow steps are: (1) load the data from the google cloud bucket, (2) preprocess each dataset, (3) PCA analysis and correlation with metadata variables, and (4) flagging potential outliers.

# Specify the parameters required for the analysis.
# This dir should have the motrpac-bic-norm-qc repo
repo_local_dir <- "~/Desktop/repos/"
# Runnable command for gsutil
gsutil_cmd <- "~/google-cloud-sdk/bin/gsutil"
# Where should the data be downloaded to
local_data_dir <- "~/Desktop/MoTrPAC/data/pass_1b/proteomics/"
# Bucket structure is: tissue/platform/results/<data files>. For GET, this path
# should also have a qa_qc directory with the qc_metrics and sample_metadata files.
bucket <- "gs://motrpac-release-data-staging/Results/pass1b-06/proteomics/"
# Bucket and local path for the phenotypic (DMAQC) data
pheno_bucket <- "gs://motrpac-internal-release2-results/pass1b-06/phenotype/"
pheno_local_dir <- "~/Desktop/MoTrPAC/data/pass_1b/dmaqc_pheno/"

# Features whose NA frequency reaches this threshold are removed
max_na_freq <- 0.66
# Should PCA use imputed data? (TRUE/FALSE rather than T/F, which are reassignable)
pca_using_impute <- FALSE

# How many PCs to examine
num_pcs <- 5
num_pcs_for_outlier_analysis <- 3
# Significance level for association analysis
# (e.g., between a PC and a clinical variable)
p_thr <- 0.0001

Specify the pipeline, metadata, and sample variables for the analysis.

# Technical (pipeline) variables considered in the QC
pipeline_qc_cols <- c("tmt_plex", "num_NA")
# Biological / biospecimen variables considered in the QC
biospec_cols <- c(
  "registration.sex",
  "key.anirandgroup",
  "registration.batchnumber",
  "training.staffid",
  "is_control",
  "vo2.max.test.vo2_max_visit1", # this assumes that visit1's are aligned
  "terminal.weight.mg",
  "time_to_freeze",
  "timepoint",
  "bid",
  "pid"
)
# Load functions and libraries from the motrpac-bic-norm-qc repo tools dir
# (sourced in the same order as before).
tool_scripts <- c(
  "unsupervised_normalization_functions.R",
  "gcp_functions.R",
  "qc_report_aux_functions.R",
  "config_session.R",
  "association_analysis_methods.R"
)
for (tool_script in tool_scripts) {
  source(paste0(repo_local_dir, "motrpac-bic-norm-qc/tools/", tool_script))
}

Load the data from the buckets

Download the data

# Pull the proteomics results from the release bucket into the local data dir.
obj <- download_bucket_to_local_dir(
  bucket,
  local_path = local_data_dir,
  remove_prev_files = TRUE,
  GSUTIL_PATH = gsutil_cmd
)

The code below takes the metadata and ratio tables. For raw intensities (RII), specify the matrix regex in “parse_proteomics_datasets_from_download_obj”.

# Rebuild the download object from the files already on disk, then parse the
# per-tissue / per-assay proteomics datasets out of it.
obj <- list(
  downloaded_files = list.files(local_data_dir, full.names = TRUE, recursive = TRUE),
  local_path = local_data_dir
)

proteomics_data <- parse_proteomics_datasets_from_download_obj(
  obj,
  local_path = local_data_dir,
  remove_prev_files = TRUE,
  GSUTIL_PATH = gsutil_cmd
)
## [1] "t55-gastrocnemius,prot-ph"
## [1] "t55-gastrocnemius,prot-pr"
## [1] "t58-heart,prot-ph"
## [1] "t58-heart,prot-pr"
## [1] "t68-liver,prot-ph"
## [1] "t68-liver,prot-pr"
## [1] "t70-white-adipose,prot-ph"
## [1] "t70-white-adipose,prot-pr"
# Split the parse results: failures vs. the successfully parsed datasets.
failed_datasets <- proteomics_data$failed_datasets
proteomics_data <- proteomics_data$proteomics_parsed_datasets
# Commented out: the printed buckets here would refer to the "ac" datasets,
# which were not submitted for this release.
# if(length(failed_datasets)>1){
#   print("Reading the data failed for some buckets, details:")
#   write.table(failed_datasets,row.names = F,quote=F,sep="\t",col.names = F)
# }

# Load the phenotypic (DMAQC) data and add a few convenience variables.
pheno_data <- parse_pheno_data(pheno_bucket, local_path = pheno_local_dir,
                               remove_prev_files = TRUE, GSUTIL_PATH = gsutil_cmd)
# tissue: shorthand for the sample-type description
pheno_data$viallabel_data$tissue <-
  pheno_data$viallabel_data$specimen.processing.sampletypedescription
# time_to_freeze: freeze time minus death time (both relative to training)
pheno_data$viallabel_data$time_to_freeze <-
  pheno_data$viallabel_data$calculated.variables.frozetime_after_train -
  pheno_data$viallabel_data$calculated.variables.deathtime_after_train
# is_control: 1 when the randomization group mentions "control", else 0
pheno_data$viallabel_data$is_control <- as.numeric(
  grepl("control", pheno_data$viallabel_data$key.anirandgroup, ignore.case = TRUE))
# Add the training timepoint (in weeks) as a number.
# x   - character vector of randomization-group descriptions
# tps - named numeric vector mapping a label prefix to the number of weeks;
#       the default covers the PASS1B four- and eight-week groups. Labels are
#       matched as "<prefix>-week", case-insensitively.
# Returns a numeric vector the same length as x; 0 where nothing matched.
get_numeric_timepoint <- function(x, tps = c("Eight" = 8, "Four" = 4)){
  v <- rep(0, length(x))
  for(tp in names(tps)){
    v[grepl(paste0(tp, "-week"), x, perl = TRUE, ignore.case = TRUE)] <- tps[[tp]]
  }
  return(v)
}
# Numeric timepoint for every vial label, derived from the group description.
pheno_data$viallabel_data$timepoint <-
  get_numeric_timepoint(pheno_data$viallabel_data$key.anirandgroup)

Normalize and filter the data

We use vp.misc’s remove_batch_effects to account for plex variation. We also keep a version of the data with imputed missing values. The new data objects are saved into the proteomics_data list.

# For each dataset: filter high-NA features, median/MAD-normalize, remove the
# TMT-plex (batch) effect, and create KNN-imputed versions for later PCA.
for(dataset_name in names(proteomics_data)){
  print(paste("processing",dataset_name))
  curr_data  = proteomics_data[[dataset_name]]$sample_data
  # align the sample metadata rows with the columns (samples) of the matrix
  sample_meta = proteomics_data[[dataset_name]]$sample_meta[colnames(curr_data),]
  
  # Exclude features whose NA frequency reaches max_na_freq (keep < threshold)
  row_NA_data = rowSums(is.na(curr_data))/ncol(curr_data)
  feature_inds = row_NA_data < max_na_freq
  # filter, then normalize via run_median_mad_norm (defined in the sourced
  # tools scripts; presumably median/MAD scaling — see the repo for details)
  norm_data = as.matrix(curr_data[feature_inds,])
  norm_data = run_median_mad_norm(norm_data)
  # "_b" suffix marks the batch-corrected matrix (tmt_plex as the batch)
  norm_data_b = limma::removeBatchEffect(norm_data,sample_meta$tmt_plex)
  
  # KNN-impute missing values; capture.output suppresses impute.knn's prints.
  # The imputed matrices will be used for PCA.
  capture.output({
    norm_data_b_imp = impute::impute.knn(as.matrix(norm_data_b))$data
    norm_data_imp = impute::impute.knn(as.matrix(norm_data))$data
  },file=NULL)
  
  # Median normalization (per MOP) — currently disabled
  # norm_data_imp = run_median_mad_norm(norm_data_imp)
  if(any(is.na(norm_data_imp))){
    print(paste("Imputation failed in:",dataset_name))
  }
  
  # Save all versions into the per-dataset entry:
  #   norm_data    - filtered + normalized
  #   norm_data_b  - additionally batch (plex) corrected
  #   *_imp        - KNN-imputed counterparts
  #   feature_inds - logical mask of the features kept by the NA filter
  proteomics_data[[dataset_name]][["norm_filtered"]] = list(
    norm_data_b = norm_data_b,
    norm_data_b_imp = norm_data_b_imp,
    norm_data = norm_data,
    norm_data_imp = norm_data_imp,
    feature_inds = feature_inds
  )
  print(paste0('done, objects saved into proteomics_data[["',
               dataset_name,'"]][[,norm_filtered]]'))
}

Examine the effect of the normalization process, which includes removing features with many NAs. For each dataset show the boxplots of three randomly selected plexes.

# For each dataset, compare raw vs. normalized distributions on (up to) three
# randomly selected TMT plexes.
for(i in seq_along(proteomics_data)){
  unnorm_data <- proteomics_data[[i]]$sample_data
  # norm_filtered[[1]] is the batch-corrected normalized matrix
  norm_data <- proteomics_data[[i]][["norm_filtered"]][[1]]
  plex <- proteomics_data[[i]]$sample_meta[colnames(norm_data),"tmt_plex"]
  plex <- as.factor(plex)
  # order the samples by plex so boxes from the same plex are adjacent
  ord <- order(plex)
  unnorm_data <- unnorm_data[,ord]
  norm_data <- norm_data[,ord]
  plex <- plex[ord]
  # take a random subset of at most three plexes
  # (the original `[1:3]` produced NA entries when fewer than three exist)
  n_sel <- min(3, length(unique(plex)))
  samp_plex <- sample(unique(plex))[seq_len(n_sel)]
  samp <- which(plex %in% samp_plex)
  # plot; color each box by the plex of the sample actually plotted
  # (the original passed `col=plex`, which was not aligned with `samp`)
  curr_name <- names(proteomics_data)[i]
  colnames(unnorm_data) <- NULL;colnames(norm_data) <- NULL
  boxplot(unnorm_data[,samp],
          main=paste0(curr_name,"\nraw data (",nrow(unnorm_data)," features)"),
          ylab = "log2 ratio",xaxt="n",col=plex[samp])
  boxplot(norm_data[,samp],
          main=paste0(curr_name,"\nnorm data (ratios) (",nrow(norm_data)," features)"),
          ylab = "log2 ratio",xaxt="n",col=plex[samp])
  
  # # Check with and without batch effects analysis
  # par(mfrow=c(1,3))
  # norm_data = proteomics_data[[i]][["norm_filtered"]][[2]]
  # boxplot(norm_data[,samp],
  #         main=paste0(curr_name,"\nnorm data (",nrow(norm_data)," features)"),
  #         ylab = "log2 ratio",xaxt="n",col=plex)
  # norm_data2 = limma::removeBatchEffect(norm_data,plex)
  # es = ExpressionSet(norm_data)
  # es$plex = plex
  # norm_data3 = remove_batch_effect(es,"plex")
  # norm_data3 = exprs(norm_data3)
  # boxplot(norm_data2[,samp],
  #         main=paste0(curr_name,"\nnorm data + limma (",nrow(norm_data)," features)"),
  #         ylab = "log2 ratio",xaxt="n",col=plex)
  # boxplot(norm_data3[,samp],
  #         main=paste0(curr_name,"\nnorm data + vp.misc (",nrow(norm_data)," features)"),
  #         ylab = "log2 ratio",xaxt="n",col=plex)
}

PCA before and after adjusting for batch

Run PCA analysis on the imputed data, with and without mitigating batch effects:

# Helper: run PCA on the samples (columns) of `mat` and package the results.
# mat                - features x samples numeric matrix (no NAs)
# n_pcs              - number of leading PCs to keep in the projection
# sex/plex/randgroup - per-sample annotations added to the output data frame
# Returns list(pcax = data.frame of PCs + annotations,
#              explained_var = cumulative variance explained by top n_pcs PCs)
compute_pca_summary <- function(mat, n_pcs, sex, plex, randgroup){
  pca_res <- prcomp(t(mat), scale. = TRUE, center = TRUE)
  pcax <- pca_res$x[, 1:n_pcs]
  # row 3 of "importance" is the cumulative proportion of variance; indexed by
  # n_pcs (the original hard-coded column 5, assuming num_pcs == 5)
  explained_var <- summary(pca_res)[["importance"]][3, n_pcs]
  list(pcax = data.frame(pcax, sex = sex, plex = plex, randgroup = randgroup),
       explained_var = explained_var)
}

for(dataset_name in names(proteomics_data)){
  data_obj <- proteomics_data[[dataset_name]]
  data_samples <- colnames(data_obj[["norm_filtered"]][["norm_data_b"]])
  randgroup <- pheno_data$viallabel_data[data_samples,"key.anirandgroup"]
  sex <- pheno_data$viallabel_data[data_samples,"registration.sex"]
  # recode sex codes: "1" -> F, "2" -> M
  sex[sex=="1"] <- "F";sex[sex=="2"] <- "M"
  plex <- data_obj$sample_meta[data_samples,"tmt_plex"]
  
  # Select the input matrix: the imputed version, or (when not using
  # imputation) only the complete rows of the non-imputed matrix.
  get_pca_matrix <- function(slot){
    if(pca_using_impute){
      data_obj[["norm_filtered"]][[paste0(slot, "_imp")]]
    } else {
      m <- data_obj[["norm_filtered"]][[slot]]
      m[rowSums(is.na(m))==0,]
    }
  }
  
  # pca after removing batch effects
  proteomics_data[[dataset_name]][["norm_filtered"]][["pca_b"]] <-
    compute_pca_summary(get_pca_matrix("norm_data_b"),
                        num_pcs, sex, plex, randgroup)
  # pca without removing batch effects
  proteomics_data[[dataset_name]][["norm_filtered"]][["pca_w_b"]] <-
    compute_pca_summary(get_pca_matrix("norm_data"),
                        num_pcs, sex, plex, randgroup)
}

PCA plots:

# Plot PC1 vs PC2 colored by plex and shaped by sex, before ("pca_w_b") and
# after ("pca_b") the batch-effect correction.
for(dataset_name in names(proteomics_data)){
  stage_titles <- c(pca_w_b = "before batch effects analysis",
                    pca_b = "after batch effects analysis")
  for(slot in names(stage_titles)){
    df <- proteomics_data[[dataset_name]][["norm_filtered"]][[slot]]$pcax
    pc_plot <- ggplot(df) +
      geom_point(aes(x=PC1, y=PC2,col=plex, shape=sex)) +
      ggtitle(paste(dataset_name, stage_titles[[slot]], sep=": "))
    plot(pc_plot)
  }
}

PCA visualization (sex and group)

We plot the top two PCs for each tissue, color and shape correspond to randomization group and sex.

# Batch-corrected PCA: PC1 vs PC2, colored by randomization group, shaped by sex.
for(dataset_name in names(proteomics_data)){
  pca_df <- proteomics_data[[dataset_name]][["norm_filtered"]][["pca_b"]]$pcax
  group_plot <- ggplot(pca_df) +
    geom_point(aes(x=PC1, y=PC2,col=randgroup, shape=sex)) +
    ggtitle(dataset_name)
  plot(group_plot)
}

Correlations with clinical variables

Here we analyze the top principal components (5) in each tissue and compute their association with the selected variables above (e.g., the pipeline qc metrics). We use Spearman correlation and a linear test for significance. For each dataset we also add the correlations among the metadata variables. These two analyses should be interpreted together, as the analyzed variables are not independent.

# Associate the top PCs with the selected technical/biological variables
# (Spearman correlation + significance via pairwise_eval), and also compute
# the associations among the metadata variables themselves.
pcs_vs_qc_var_report = c()
metadata_variable_assoc_report = c()
for(dataset_name in names(proteomics_data)){
  # Projected PCs from the batch-corrected PCA; keep only the PC columns
  curr_pcax = proteomics_data[[dataset_name]][["norm_filtered"]][["pca_b"]][[1]]
  curr_pcax = curr_pcax[,grepl("^PC",names(curr_pcax))]
  curr_pipeline_meta = proteomics_data[[dataset_name]]$sample_meta
  curr_data_samples = rownames(curr_pcax)
  curr_meta = pheno_data$viallabel_data[curr_data_samples,biospec_cols]
  # remove metadata variables with NAs
  curr_meta = curr_meta[,!apply(is.na(curr_meta),2,any)]
  # remove bid and pid (sample identifiers, not covariates)
  curr_meta = curr_meta[,!grepl("pid|bid",names(curr_meta))]
  # add the TMT plex as a numeric batch variable
  curr_meta$plex = as.numeric(
    as.factor(curr_pipeline_meta[curr_data_samples,"tmt_plex"]))
  curr_meta_numeric_cols = sapply(curr_meta,is.numeric)
  # remove zero-sd columns (drop=FALSE guards the single-column case)
  curr_meta_numeric_cols[curr_meta_numeric_cols] =
    apply(curr_meta[,curr_meta_numeric_cols,drop=FALSE],2,sd)>0
  
  corrs = cor(curr_pcax,curr_meta[,curr_meta_numeric_cols],method="spearman")
  corrsp = pairwise_eval(
    curr_pcax,curr_meta[,curr_meta_numeric_cols],func=pairwise_association_wrapper,
    f=1,max_num_vals_for_discrete=8)
  
  # Some ggplots have to be printed to be shown in the notebook
  print(ggcorrplot(t(corrs),lab=TRUE,lab_size=2.5,hc.order = FALSE) +
    ggtitle(dataset_name) +
    theme(plot.title = element_text(hjust = 0.5,size=20)))
  
  # record every PC-vs-variable pair that passes the significance threshold
  for(i in seq_len(nrow(corrsp))){
    for(j in seq_len(ncol(corrsp))){
      if(corrsp[i,j]>p_thr){next}
      pcs_vs_qc_var_report = rbind(pcs_vs_qc_var_report,
            c(dataset_name,
              rownames(corrsp)[i],colnames(corrsp)[j],corrs[i,j],corrsp[i,j])
            )
    }
  }
  # guard: the report stays NULL until something passes the threshold, and
  # colnames(NULL) errors (the original only guarded the metadata report)
  if(!is.null(dim(pcs_vs_qc_var_report))){
    colnames(pcs_vs_qc_var_report) = c("Dataset(tissue,site)","PC",
                                    "qc_metric","rho(spearman)","p-value")
  }
  
  ####
  # compute correlations among the metadata variables
  ####
  corrs = cor(curr_meta[,curr_meta_numeric_cols],method="spearman")
  corrsp = pairwise_eval(
    curr_meta[,curr_meta_numeric_cols],func=pairwise_association_wrapper,
    f=1,max_num_vals_for_discrete=8)
  
  # Some ggplots have to be printed to be shown in the notebook
  print(ggcorrplot(corrs,lab=TRUE,lab_size=2.5,hc.order = TRUE))
  
  # lower-triangle traversal: `break` at n1==n2 skips the diagonal and the
  # upper triangle so each pair is reported once
  for(n1 in rownames(corrsp)){
    for(n2 in rownames(corrsp)){
      if(n1==n2){break}
      # skip pairs where both variables are biospecimen variables
      if(n1 %in% biospec_cols &&
         n2 %in% biospec_cols) {next}
      if(corrsp[n1,n2]>p_thr){next}
      metadata_variable_assoc_report = rbind(metadata_variable_assoc_report,
            c(dataset_name,n1,n2,corrs[n1,n2],corrsp[n1,n2])
      )
    }
  }
  if(!is.null(dim(metadata_variable_assoc_report))){
    colnames(metadata_variable_assoc_report) = c(
      "Dataset(tissue,site)","Var1","Var2","rho(spearman)","p-value")
  }

}

# Format the numeric report columns (rho and p-value) for nicer presentation.
# Guard both reports: they remain NULL when no association passed p_thr, and
# indexing NULL with [,5] would error.
if(!is.null(dim(pcs_vs_qc_var_report))){
  pcs_vs_qc_var_report[,5] = format(
    as.numeric(pcs_vs_qc_var_report[,5]),digits=3)
  pcs_vs_qc_var_report[,4] = format(
    as.numeric(pcs_vs_qc_var_report[,4]),digits=3)
}
if(!is.null(dim(metadata_variable_assoc_report))){
  metadata_variable_assoc_report[,5] = format(
    as.numeric(metadata_variable_assoc_report[,5]),digits=3)
  metadata_variable_assoc_report[,4] = format(
    as.numeric(metadata_variable_assoc_report[,4]),digits=3)
}

PCA outliers

In this analysis, outliers are flagged by examining the boxplot of each PC, extending its whiskers to three times the inter-quartile range (IQR) away from the box. Samples outside this range are then flagged.

Note that samples are flagged using an automatic analysis of the principal components. Such analyses may flag samples because of true biological effects and thus further examination is required before determining if flagged samples represent low quality samples. Moreover, note that we plot outliers for the RII (raw intensities) data as well. Typically, outliers from such datasets will not appear in the better-normalized ratio data.

# Flag per-PC outliers: samples beyond 3x IQR boxplot whiskers on any of the
# top num_pcs_for_outlier_analysis PCs of the batch-corrected PCA.
pca_outliers_report = c()
for(dataset_name in names(proteomics_data)){
  # Get the projected PCs. Fix: the PCA results were stored under "pca_b"
  # above — there is no "pca" entry, so the original lookup returned NULL.
  curr_pcax = proteomics_data[[dataset_name]][["norm_filtered"]][["pca_b"]][[1]]
  # Univariate: use IQRs
  pca_outliers = c()
  for(j in seq_len(num_pcs_for_outlier_analysis)){
    # Keep the sample ids on the score vector: extracting a data.frame column
    # drops them, and boxplot.stats()$out only carries names present on input,
    # so without this the loop over names(outlier_values) never ran.
    pc_scores = setNames(curr_pcax[,j], rownames(curr_pcax))
    outlier_values <- boxplot.stats(pc_scores,coef = 3)$out
    for(outlier in names(outlier_values)){
      pca_outliers_report = rbind(pca_outliers_report,
       c(dataset_name,paste("PC",j,sep=""),outlier,
         format(outlier_values[outlier],digits=5))
      )
      # remember each flagged sample once (first PC that flagged it)
      if(!is.element(outlier,names(pca_outliers))){
        pca_outliers[outlier] = outlier_values[outlier]
      }
    }
  }
  
  # Plot the outliers (green) in PC1/PC2 and PC3/PC4 space
  if(length(pca_outliers)>0){
    df = data.frame(curr_pcax,
                outliers = rownames(curr_pcax) %in% names(pca_outliers))
    col = rep("black",nrow(df))
    col[df$outliers] = "green"
    plot(df$PC1,df$PC2,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
         main = paste(dataset_name,"flagged outliers"),
         xlab = "PC1",ylab="PC2")
    plot(df$PC3,df$PC4,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
         main = paste(dataset_name,"flagged outliers"),
         xlab = "PC3",ylab="PC4")
  }
}

if(!is.null(dim(pca_outliers_report))){
  colnames(pca_outliers_report) =  c("dataset","PC","sample","score")
}

Sex prediction accuracy

Here we map features to their chromosomes and then use the X and Y chromosomes to train a classifier for predicting sex. Note that unlike the sequencing platforms we use the molecular features from the data, which requires running a regularized learning algorithm. We use linear SVM as it seems to produce reasonable results.

# Sex check: map features to chromosomes via the rat annotation (org.Rn.eg.db),
# keep X/Y-chromosome features, and train a linear SVM with leave-one-out CV.
entrez2chr = as.list(org.Rn.egCHR)
symbol2entrez = as.list(org.Rn.egSYMBOL2EG)
sex_pred_err_rates = c()
sex_check_outliers = c()
for(dataset_name in names(proteomics_data)){
  # [[2]] is norm_data_b_imp: the batch-corrected, KNN-imputed matrix
  # (see the norm_filtered list built in the normalization loop)
  curr_data = proteomics_data[[dataset_name]][["norm_filtered"]][[2]]
  curr_annot = proteomics_data[[dataset_name]]$row_annot
  # get the correct row annotation (was filtered by NAs above)
  feature_inds = proteomics_data[[dataset_name]][["norm_filtered"]]$feature_inds
  curr_annot = curr_annot[feature_inds,]
  # map gene symbols to entrez ids
  curr_entrez = symbol2entrez[curr_annot[,"gene_symbol"]]
  # keep rows that map to exactly one entrez id
  single_entrez  = sapply(curr_entrez,length)==1
  curr_data = curr_data[single_entrez,]
  curr_annot = curr_annot[single_entrez,]
  curr_entrez = unlist(curr_entrez[single_entrez])
  # map entrez ids to chromosomes
  curr_chrs = entrez2chr[curr_entrez]
  # limit the data to features on the X or Y chromosome
  chrr_y_genes = sapply(curr_chrs,function(x)any(x=="Y" | x=="X"))
  chrr_y_genes[is.na(chrr_y_genes)] = F
  # need at least two sex-chromosome features to train a classifier
  if(sum(chrr_y_genes,na.rm=T)<2){next}
  sex = pheno_data$viallabel_data[colnames(curr_data),"registration.sex"]
  # samples in rows, X/Y features in columns; drop columns with any NA
  df = data.frame(sex=as.factor(sex),t(curr_data[chrr_y_genes,]))
  df = df[,colSums(is.na(df))==0]
  # run an svm classifier; number = nrow(df) makes the CV leave-one-out
  train_control <- caret::trainControl(method = "cv", number = nrow(df),
                                savePredictions = TRUE)
  # train the model on training set
  model <- caret::train(sex ~ .,data = df,
               trControl = train_control,method = "svmLinear2")
  # loocv results; keep only the predictions made at cost == 0.5
  cv_res = model$pred
  cv_res = cv_res[cv_res[,"cost"]==0.5,]
  # NOTE(review): assumes columns 1 and 2 of model$pred are (pred, obs) —
  # confirm against the caret version in use
  cv_errors = cv_res[,1] != cv_res[,2]
  acc = 1 - (sum(cv_errors)/nrow(cv_res))
  print(paste(
    "In dataset",dataset_name,
    "sex prediction using X,Y chromosomes, cross-validation accuracy is:",
    round(acc,digits = 3)))
  sex_pred_err_rates = rbind(sex_pred_err_rates,
      c(dataset_name,mean(cv_errors))
  )
  # record every sample whose sex was mispredicted in CV
  err_samples = rownames(df)[cv_errors]
  for(samp in err_samples){
    sex_check_outliers = rbind(
      sex_check_outliers,c(dataset_name,samp))
  }
}
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t55-gastrocnemius,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.967"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t55-gastrocnemius,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.983"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t58-heart,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t58-heart,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t68-liver,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t68-liver,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.983"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t70-white-adipose,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t70-white-adipose,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
# Name the columns of the sex-check table (skipped when no sample failed).
has_sex_outliers <- !is.null(dim(sex_check_outliers))
if(has_sex_outliers){
  colnames(sex_check_outliers) <- c("dataset","sample")
}

Save the normalized data in the bucket

# use the raw bucket names as the output location
# (implied assumption: each proteomics_data entry is written back to the same
#  tissue/assay path it was read from)

# When the release data become ready - out_bucket
# should be the same as the input (out_bucket == bucket)
out_bucket <- bucket

# Prepend three header rows (viallabel, pid, bid) to a numeric matrix whose
# values are rounded to 5 digits. rbind with character headers coerces the
# whole result to character; column names are dropped (the viallabels become
# the first row instead).
format_output_matrix <- function(mat, pids, bids){
  out <- mat
  for(j in seq_len(ncol(out))){
    out[,j] <- round(as.numeric(out[,j]),digits=5)
  }
  out <- rbind(bids,out)
  out <- rbind(pids,out)
  out <- rbind(colnames(mat),out)
  colnames(out) <- NULL
  rownames(out)[1:3] <- c("viallabel","pid","bid")
  out
}

# Sanity check: the numeric part of a formatted matrix must match its source
# matrix (to within rounding); NAs in the source are ignored.
formatted_matches <- function(formatted, mat){
  m <- formatted[-c(1:3),]
  mode(m) <- "numeric"
  all(abs(m-mat)< 1e-05,na.rm=TRUE)
}

# Write a table locally, then copy it to the destination bucket with gsutil.
write_and_upload <- function(tab, local_fname, dest_bucket, with_col_names){
  write.table(tab,file=local_fname,sep="\t",quote=FALSE,
         row.names = TRUE,col.names = with_col_names)
  system(paste(gsutil_cmd,"cp",local_fname,dest_bucket))
}

for(dataset_name in names(proteomics_data)){
  # [[1]]/[[2]]: batch-corrected matrix and its imputed counterpart
  x <- proteomics_data[[dataset_name]][["norm_filtered"]][[1]]
  x_imp <- proteomics_data[[dataset_name]][["norm_filtered"]][[2]]
  x_annot <- proteomics_data[[dataset_name]]$row_annot
  # get the correct row annotation (features were filtered by NAs above)
  feature_inds <- proteomics_data[[dataset_name]][["norm_filtered"]]$feature_inds
  x_annot <- x_annot[feature_inds,]
  # "tissue,assay" name -> "tissue/assay/" bucket suffix
  out_b <- paste0(out_bucket,gsub(",","/",dataset_name),"/")
  rownames(x) <- seq_len(nrow(x))
  rownames(x_annot) <- rownames(x)
  # viallabel -> bid/pid mapping for the header rows
  x_bids <- pheno_data$viallabel_data[colnames(x),"bid"]
  x_pids <- pheno_data$viallabel_data[colnames(x),"pid"]
  newx <- format_output_matrix(x, x_pids, x_bids)
  newx_imp <- format_output_matrix(x_imp, x_pids, x_bids)
  
  # sanity checks before upload
  if(!formatted_matches(newx, x)){
    print("Error in parsing the matrix, breaking")
    break
  }
  # Fix: compare the imputed output against x_imp
  # (the original compared it against x, which passed vacuously at NAs)
  if(!formatted_matches(newx_imp, x_imp)){
    print("Error in parsing the matrix, breaking")
    break
  }
  
  # save the output files to the target bucket
  arr <- strsplit(dataset_name,split=",")[[1]]
  curr_ome <- tolower(arr[2])
  curr_t <- tolower(arr[1])
  local_fname <- paste0(local_data_dir,
                       "motrpac_pass1b-06_",curr_t,"_",curr_ome,
                       "_med-mad-normalized-logratio.txt")
  write_and_upload(newx, local_fname, out_b, with_col_names = FALSE)
  local_fname_imp <- paste0(local_data_dir,
                       "motrpac_pass1b-06_",curr_t,"_",curr_ome,
                       "_med-mad-normalized-imputed-logratio.txt")
  write_and_upload(newx_imp, local_fname_imp, out_b, with_col_names = FALSE)
  local_fname2 <- paste0(local_data_dir,
                       "motrpac_pass1b-06_",curr_t,"_",curr_ome,
                       "_normalized-data-feature-annot.txt")
  write_and_upload(x_annot, local_fname2, out_b, with_col_names = TRUE)

}